{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {
    "papermill": {
     "duration": 0.010186,
     "end_time": "2020-06-23T18:38:06.385853",
     "exception": false,
     "start_time": "2020-06-23T18:38:06.375667",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "## Download a data set\n",
    "\n",
    "This notebook downloads a data set file from a public location. If the data set file is a compressed archive it will be decompressed. Upon completion the raw data set files  are located in the `data\\` directory.\n",
    "\n",
    "This notebook requires the following environment variables:\n",
    " -  `DATASET_URL` Public data set URL, e.g. `https://dax-cdn.cdn.appdomain.cloud/dax-fashion-mnist/1.0.2/fashion-mnist.tar.gz`"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "papermill": {
     "duration": 0.115145,
     "end_time": "2020-06-23T18:38:06.509214",
     "exception": false,
     "start_time": "2020-06-23T18:38:06.394069",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "import os\n",
    "from pathlib import Path\n",
    "import requests\n",
    "import tarfile\n",
    "from urllib.parse import urlparse"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "papermill": {
     "duration": 0.008412,
     "end_time": "2020-06-23T18:38:06.526862",
     "exception": false,
     "start_time": "2020-06-23T18:38:06.518450",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "Verify that the `DATASET_URL` environment variable is set. If it is not set, a RuntimeError is raised."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "papermill": {
     "duration": 0.035693,
     "end_time": "2020-06-23T18:38:06.569042",
     "exception": false,
     "start_time": "2020-06-23T18:38:06.533349",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "if os.environ.get('DATASET_URL') is None:\n",
    "    raise RuntimeError('Cannot run notebook. Required parameter is missing.')\n",
    "    \n",
    "data_file = os.environ['DATASET_URL']"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {
    "papermill": {
     "duration": 0.007602,
     "end_time": "2020-06-23T18:38:06.585252",
     "exception": false,
     "start_time": "2020-06-23T18:38:06.577650",
     "status": "completed"
    },
    "tags": []
   },
   "source": [
    "Download the data set from the location specified in `dataset_url`, extract it (if it is compressed) and store it in the directory identified by `data_dir_name`."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "papermill": {
     "duration": 1.224952,
     "end_time": "2020-06-23T18:38:07.819301",
     "exception": false,
     "start_time": "2020-06-23T18:38:06.594349",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "data_dir_name = 'data'\n",
    "\n",
    "print('Downloading data file {} ...'.format(data_file))\n",
    "r = requests.get(data_file)\n",
    "if r.status_code != 200:\n",
    "    raise RuntimeError('Could not fetch {}: HTTP status code {}'.format(data_file, r.status_code))\n",
    "else:\n",
    "    # extract data set file name from URL \n",
    "    data_file_name = Path((urlparse(data_file).path)).name\n",
    "    # create the directory where the downloaded file will be stored\n",
    "    data_dir = Path(data_dir_name)    \n",
    "    data_dir.mkdir(parents=True, exist_ok=True)\n",
    "    downloaded_data_file = data_dir / data_file_name\n",
    "\n",
    "    print('Saving downloaded file \"{}\" as ...'.format(data_file_name))\n",
    "    with open(downloaded_data_file, 'wb') as downloaded_file:\n",
    "        downloaded_file.write(r.content)\n",
    "    \n",
    "    if r.headers['content-type'] == 'application/x-tar':\n",
    "        print('Extracting downloaded file in directory \"{}\" ...'.format(data_dir))\n",
    "        with tarfile.open(downloaded_data_file, 'r') as tar:\n",
    "            tar.extractall(data_dir)\n",
    "        print('Removing downloaded file ...')\n",
    "        downloaded_data_file.unlink()"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Display list of extracted data files"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "papermill": {
     "duration": 1.130282,
     "end_time": "2020-06-23T18:38:08.962229",
     "exception": false,
     "start_time": "2020-06-23T18:38:07.831947",
     "status": "completed"
    },
    "tags": []
   },
   "outputs": [],
   "source": [
    "! ls data/"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.7.6"
  },
  "papermill": {
   "duration": 3.951724,
   "end_time": "2020-06-23T18:38:09.091307",
   "environment_variables": {},
   "exception": null,
   "input_path": "load_data.ipynb",
   "output_path": "load_data-output.ipynb",
   "parameters": {},
   "start_time": "2020-06-23T18:38:05.139583",
   "version": "2.1.1"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
